home *** CD-ROM | disk | FTP | other *** search
# A part of pdfrw (pdfrw.googlecode.com)
# Copyright (C) 2006-2009 Patrick Maupin, Austin, Texas
# MIT license -- See LICENSE.txt for details

'''
The PdfReader class reads an entire PDF file into memory and
parses the top-level container objects.  (It does not parse
into streams.)  PdfReader subclasses PdfDict, and the
document pages are stored as a list in the pages attribute
of the object.
'''
-
- from pdftokens import PdfTokens
- from pdfobjects import PdfDict, PdfArray, PdfName
- from pdfcompress import uncompress
-
class PdfReader(PdfDict):
    ''' Read a whole PDF file into memory and parse its top-level
        container objects.  Stream contents are stored but not
        parsed.

        After construction the instance has been updated with the
        entries of the trailer dictionary, and the page dictionaries
        found under Root.Pages are available as a list in the pages
        attribute.
    '''

    class unresolved:
        # Used as a placeholder until we have an object.
        pass

    def readindirect(self, objnum, gennum):
        ''' Read an indirect object.  If it has already
            been read, return it from the cache.

            objnum and gennum are the object and generation numbers;
            anything int() accepts will do.  A KeyError propagates if
            the pair was never registered by parsexref, and an
            AssertionError is raised if the header found at the
            recorded offset does not match the requested object.
        '''

        def setobj(obj):
            # Store the new object in the cache record as soon as it
            # exists, so that references back to this object that are
            # met while its contents are still being parsed hit the
            # cache instead of recursing forever.
            record[1] = obj

        def ordinary(source, setobj, obj):
            # Deal with an ordinary (non-array, non-dict) object:
            # the token itself is the value.
            setobj(obj)
            return obj

        fdata, objnum, gennum = self.fdata, int(objnum), int(gennum)
        record = self.indirect_objects[fdata, objnum, gennum]
        if record[1] is not self.unresolved:
            return record[1]

        # Read the object header ("<objnum> <gennum> obj") at the
        # recorded file offset and validate it.
        source = PdfTokens(fdata, record[0])
        objid = source.multiple(3)
        assert int(objid[0]) == objnum, objid
        assert int(objid[1]) == gennum, objid
        assert objid[2] == 'obj', objid

        # Read the object, and call special code if it starts
        # an array or dictionary
        obj = source.next()
        obj = self.special.get(obj, ordinary)(source, setobj, obj)
        self.readstream(obj, source)
        obj.indirect = True
        return obj

    def readstream(obj, source):
        ''' Read optional stream following a dictionary
            object.

            source must be positioned just after the object body.
            If the next token is 'endobj' nothing happens.  Otherwise
            the token must be 'stream' and obj must be a PdfDict; the
            raw stream bytes are stored in obj._stream.  When the
            dictionary's Length does not lead to 'endstream endobj',
            the stream end is found by searching for 'endstream' and
            obj.Length is rewritten to the length actually used.
        '''
        tok = source.next()
        if tok == 'endobj':
            return  # No stream

        assert isinstance(obj, PdfDict)
        assert tok == 'stream', tok
        fdata = source.fdata

        # Find the end of the 'stream' keyword by searching backwards
        # from the tokenizer's current position.
        floc = fdata.rindex(tok, 0, source.floc) + len(tok)

        # The keyword must be followed by LF or CR LF; the stream
        # data starts right after that line ending.
        ch = fdata[floc]
        if ch == '\r':
            floc += 1
            ch = fdata[floc]
        assert ch == '\n'
        startstream = floc + 1

        # First attempt: trust the /Length entry.
        endstream = startstream + int(obj.Length)
        obj._stream = fdata[startstream:endstream]
        source = PdfTokens(fdata, endstream)
        endit = source.multiple(2)
        if endit != 'endstream endobj'.split():
            # /Length attribute is broken, try to read stream
            # anyway disregarding the specified value
            # TODO: issue warning here once we have some kind of
            #       logging
            endstream = fdata.index('endstream', startstream)

            # Do not count the line ending that precedes the
            # 'endstream' keyword as stream data.
            if fdata[endstream-2:endstream] == '\r\n':
                endstream -= 2
            elif fdata[endstream-1] in ['\n', '\r']:
                endstream -= 1
            source = PdfTokens(fdata, endstream)
            endit = source.multiple(2)
            assert endit == 'endstream endobj'.split()
            obj.Length = str(endstream-startstream)
            obj._stream = fdata[startstream:endstream]
    readstream = staticmethod(readstream)

    def readarray(self, source, setobj=lambda x:None, original=None):
        ''' Parse an array whose opening '[' has already been
            consumed from source, and return it as a PdfArray.

            setobj is called with the (still empty) result before any
            element is read.  original is unused; it is accepted so
            the method can be invoked with the same three arguments
            readindirect passes to every handler.
        '''
        special = self.special
        result = PdfArray()
        setobj(result)

        for value in source:
            if value == ']':
                break
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
            elif value == 'R':
                # Indirect reference: the two values appended just
                # before 'R' are the object number and generation.
                generation = result.pop()
                value = self.readindirect(result.pop(), generation)
            result.append(value)
        return result

    def readdict(self, source, setobj=lambda x:None, original=None):
        ''' Parse a dictionary whose opening '<<' has already been
            consumed from source, and return it as a PdfDict.

            setobj is called with the (still empty) result before any
            entry is read.  original is unused; it is accepted so the
            method can be invoked with the same three arguments
            readindirect passes to every handler.  An AssertionError
            is raised if a key does not start with '/' or if two
            consecutive numbers are not followed by 'R'.
        '''
        special = self.special
        result = PdfDict()
        setobj(result)

        tok = source.next()
        while tok != '>>':
            assert tok.startswith('/'), (tok, source.multiple(10))
            key = tok
            value = source.next()
            if value in special:
                # Nested array or dictionary
                value = special[value](source)
                tok = source.next()
            else:
                # One token of lookahead is needed to tell a plain
                # value from an indirect reference ("num gen R").
                tok = source.next()
                if value.isdigit() and tok.isdigit():
                    # Consume the token outside the assert so that
                    # parsing still advances when asserts are
                    # disabled (python -O).
                    reftok = source.next()
                    assert reftok == 'R', reftok
                    value = self.readindirect(value, tok)
                    tok = source.next()
            result[key] = value

        return result

    def readxref(fdata):
        ''' Locate the cross-reference table.

            Finds the last 'startxref' in fdata and checks that it is
            followed by exactly a numeric offset and '%%EOF'.
            Returns (startloc, source) where startloc is the index of
            that 'startxref' keyword and source is a PdfTokens
            positioned at the offset it names.
        '''
        startloc = fdata.rindex('startxref')
        # NOTE(review): meaning of the third PdfTokens argument is
        # not visible here -- confirm in pdftokens.
        xrefinfo = list(PdfTokens(fdata, startloc, False))
        assert len(xrefinfo) == 3, xrefinfo
        assert xrefinfo[0] == 'startxref', xrefinfo[0]
        assert xrefinfo[1].isdigit(), xrefinfo[1]
        assert xrefinfo[2].rstrip() == '%%EOF', repr(xrefinfo[2])
        return startloc, PdfTokens(fdata, int(xrefinfo[1]))
    readxref = staticmethod(readxref)

    def parsexref(self, source):
        ''' Parse the cross-reference table from source, stopping
            after the 'trailer' keyword has been consumed.

            Every in-use ('n') entry is registered in
            self.indirect_objects under the key
            (fdata, objnum, generation) with the value
            [offset, unresolved].  Keys that are already present
            are left untouched.
        '''
        tok = source.next()
        assert tok == 'xref', tok
        while 1:
            tok = source.next()
            if tok == 'trailer':
                break
            # Each subsection starts with the first object number
            # and the number of entries that follow.
            startobj = int(tok)
            for objnum in range(startobj, startobj + int(source.next())):
                offset = int(source.next())
                generation = int(source.next())
                if source.next() == 'n':
                    objid = self.fdata, objnum, generation
                    objval = [offset, self.unresolved]
                    self.indirect_objects.setdefault(objid, objval)

    pagename = PdfName.Page
    pagesname = PdfName.Pages

    def readpages(self, node):
        ''' Return a flat list of the Page dictionaries found at or
            below node.

            An AssertionError is raised if a node's Type is neither
            Page nor Pages.
        '''
        # PDFs can have arbitrarily nested Pages/Page
        # dictionary structures.
        if node.Type == self.pagename:
            return [node]
        assert node.Type == self.pagesname, node.Type
        result = []
        for kid in node.Kids:
            result.extend(self.readpages(kid))
        return result

    def __init__(self, fname=None, fdata=None, decompress=True):
        ''' Load and parse a PDF.

            fname      -- a file name, or an object with a read()
                          method; must not be combined with fdata.
            fdata      -- the complete file contents, used when fname
                          is None.
            decompress -- when true, self.uncompress() is called
                          after parsing.
        '''

        if fname is not None:
            assert fdata is None
            # Allow reading preexisting streams like pyPdf
            if hasattr(fname, 'read'):
                fdata = fname.read()
            else:
                f = open(fname, 'rb')
                # Make sure the handle is released even when
                # read() fails.
                try:
                    fdata = f.read()
                finally:
                    f.close()

        assert fdata is not None
        # Drop NUL characters trailing the file data
        fdata = fdata.rstrip('\00')
        # NOTE(review): self.private presumably stores attributes
        # without adding them as dictionary entries -- confirm in
        # PdfDict.
        self.private.fdata = fdata

        self.private.indirect_objects = {}
        self.private.special = {'<<': self.readdict, '[': self.readarray}

        startloc, source = self.readxref(fdata)
        self.parsexref(source)

        # The trailer dictionary follows the 'trailer' keyword.
        # Tokens are consumed outside the asserts so that parsing
        # still advances when asserts are disabled (python -O).
        tok = source.next()
        assert tok == '<<', tok
        self.update(self.readdict(source))
        tok = source.next()
        assert tok == 'startxref' and source.floc > startloc
        self.private.pages = self.readpages(self.Root.Pages)
        if decompress:
            self.uncompress()

        # For compatibility with pyPdf
        self.private.numPages = len(self.pages)

    # For compatibility with pyPdf
    def getPage(self, pagenum):
        ''' Return the page at index pagenum of self.pages. '''
        return self.pages[pagenum]

    def uncompress(self):
        ''' Pass the cached value of every known indirect object
            to pdfcompress.uncompress.
        '''
        uncompress([x[1] for x in self.indirect_objects.itervalues()])
-